import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
import plotly.express as px
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import KFold
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the libraries used
# below -- consider narrowing the filter.
warnings.filterwarnings("ignore")
# Load the sleep-efficiency dataset from the working directory and display it.
df = pd.read_csv('Sleep_Efficiency.csv')
df
| ID | Age | Gender | Bedtime | Wakeup time | Sleep duration | Sleep efficiency | REM sleep percentage | Deep sleep percentage | Light sleep percentage | Awakenings | Caffeine consumption | Alcohol consumption | Smoking status | Exercise frequency | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 65 | Female | 2021-03-06 01:00:00 | 2021-03-06 07:00:00 | 6.0 | 0.88 | 18 | 70 | 12 | 0.0 | 0.0 | 0.0 | Yes | 3.0 |
| 1 | 2 | 69 | Male | 2021-12-05 02:00:00 | 2021-12-05 09:00:00 | 7.0 | 0.66 | 19 | 28 | 53 | 3.0 | 0.0 | 3.0 | Yes | 3.0 |
| 2 | 3 | 40 | Female | 2021-05-25 21:30:00 | 2021-05-25 05:30:00 | 8.0 | 0.89 | 20 | 70 | 10 | 1.0 | 0.0 | 0.0 | No | 3.0 |
| 3 | 4 | 40 | Female | 2021-11-03 02:30:00 | 2021-11-03 08:30:00 | 6.0 | 0.51 | 23 | 25 | 52 | 3.0 | 50.0 | 5.0 | Yes | 1.0 |
| 4 | 5 | 57 | Male | 2021-03-13 01:00:00 | 2021-03-13 09:00:00 | 8.0 | 0.76 | 27 | 55 | 18 | 3.0 | 0.0 | 3.0 | No | 3.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 447 | 448 | 27 | Female | 2021-11-13 22:00:00 | 2021-11-13 05:30:00 | 7.5 | 0.91 | 22 | 57 | 21 | 0.0 | 0.0 | 0.0 | No | 5.0 |
| 448 | 449 | 52 | Male | 2021-03-31 21:00:00 | 2021-03-31 03:00:00 | 6.0 | 0.74 | 28 | 57 | 15 | 4.0 | 25.0 | 0.0 | No | 3.0 |
| 449 | 450 | 40 | Female | 2021-09-07 23:00:00 | 2021-09-07 07:30:00 | 8.5 | 0.55 | 20 | 32 | 48 | 1.0 | NaN | 3.0 | Yes | 0.0 |
| 450 | 451 | 45 | Male | 2021-07-29 21:00:00 | 2021-07-29 04:00:00 | 7.0 | 0.76 | 18 | 72 | 10 | 3.0 | 0.0 | 0.0 | No | 3.0 |
| 451 | 452 | 18 | Male | 2021-03-17 02:30:00 | 2021-03-17 10:00:00 | 7.5 | 0.63 | 22 | 23 | 55 | 1.0 | 50.0 | 0.0 | No | 1.0 |
452 rows × 15 columns
# Column dtypes and non-null counts.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 452 entries, 0 to 451 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ID 452 non-null int64 1 Age 452 non-null int64 2 Gender 452 non-null object 3 Bedtime 452 non-null object 4 Wakeup time 452 non-null object 5 Sleep duration 452 non-null float64 6 Sleep efficiency 452 non-null float64 7 REM sleep percentage 452 non-null int64 8 Deep sleep percentage 452 non-null int64 9 Light sleep percentage 452 non-null int64 10 Awakenings 432 non-null float64 11 Caffeine consumption 427 non-null float64 12 Alcohol consumption 438 non-null float64 13 Smoking status 452 non-null object 14 Exercise frequency 446 non-null float64 dtypes: float64(6), int64(5), object(4) memory usage: 53.1+ KB
# Summary statistics (count, mean, std, quartiles, min/max).
df.describe()
| ID | Age | Sleep duration | Sleep efficiency | REM sleep percentage | Deep sleep percentage | Light sleep percentage | Awakenings | Caffeine consumption | Alcohol consumption | Exercise frequency | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 452.000000 | 452.000000 | 452.000000 | 452.000000 | 452.000000 | 452.000000 | 452.000000 | 432.000000 | 427.000000 | 438.000000 | 446.000000 |
| mean | 226.500000 | 40.285398 | 7.465708 | 0.788916 | 22.615044 | 52.823009 | 24.561947 | 1.641204 | 23.653396 | 1.173516 | 1.791480 |
| std | 130.625419 | 13.172250 | 0.866625 | 0.135237 | 3.525963 | 15.654235 | 15.313665 | 1.356762 | 30.202785 | 1.621377 | 1.428134 |
| min | 1.000000 | 9.000000 | 5.000000 | 0.500000 | 15.000000 | 18.000000 | 7.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 113.750000 | 29.000000 | 7.000000 | 0.697500 | 20.000000 | 48.250000 | 15.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 |
| 50% | 226.500000 | 40.000000 | 7.500000 | 0.820000 | 22.000000 | 58.000000 | 18.000000 | 1.000000 | 25.000000 | 0.000000 | 2.000000 |
| 75% | 339.250000 | 52.000000 | 8.000000 | 0.900000 | 25.000000 | 63.000000 | 32.500000 | 3.000000 | 50.000000 | 2.000000 | 3.000000 |
| max | 452.000000 | 69.000000 | 10.000000 | 0.990000 | 30.000000 | 75.000000 | 63.000000 | 4.000000 | 200.000000 | 5.000000 | 5.000000 |
# (rows, columns) of the raw frame.
df.shape
(452, 15)
# Number of missing values per column.
df.isna().sum()
ID 0 Age 0 Gender 0 Bedtime 0 Wakeup time 0 Sleep duration 0 Sleep efficiency 0 REM sleep percentage 0 Deep sleep percentage 0 Light sleep percentage 0 Awakenings 20 Caffeine consumption 25 Alcohol consumption 14 Smoking status 0 Exercise frequency 6 dtype: int64
# Replace the spaces in column names with underscores so columns can also be
# reached as attributes (e.g. ``df.Smoking_status``).
df.rename(
    columns={
        'Wakeup time': 'Wakeup_time',
        # The new name used to carry a stray trailing space ('Sleep_duration ').
        'Sleep duration': 'Sleep_duration',
        "Sleep efficiency": "Sleep_efficiency",
        "REM sleep percentage": "REM_sleep_percentage",
        "Deep sleep percentage": "Deep_sleep_percentage",
        "Light sleep percentage": "Light_sleep_percentage",
        "Caffeine consumption": "Caffeine_consumption",
        "Alcohol consumption": "Alcohol_consumption",
        "Smoking status": "Smoking_status",
        "Exercise frequency": "Exercise_frequency",
    },
    inplace=True,
)
# Spearman correlation heatmap of the numeric columns.
# The frame still holds object columns at this point (Gender, Bedtime, ...).
# pandas >= 2.0 raises on them instead of silently dropping them, so select
# the numeric columns explicitly; this works on older pandas as well.
plt.figure(figsize=(10, 10))
sns.heatmap(
    df.select_dtypes(include='number').corr(method='spearman'),
    annot=True,
    fmt='.0%',
)
plt.show()
# Show the distribution of values in each column of the dataframe
df.hist(figsize=(20, 20))
plt.show()
# Scatter of age against the target (sleep efficiency).
plt.scatter(df.Age, df['Sleep_efficiency'])
plt.xlabel('age')
plt.ylabel('Sleep efficiency')
Text(0, 0.5, 'Sleep efficiency')
# Histogram of the target in 10 bins with a KDE overlay.
sns.histplot(df['Sleep_efficiency'],bins=10, kde=True)
<AxesSubplot:xlabel='Sleep_efficiency', ylabel='Count'>
# Number of rows per gender.
df.Gender.value_counts()
Male 228 Female 224 Name: Gender, dtype: int64
# Bar chart of the gender counts.
sns.countplot(data=df,x="Gender", color="green")
plt.xlabel("Female or Male", color="blue",fontsize=10)
plt.ylabel("Count", color="blue",fontsize=10)
plt.show()
# Number of rows per distinct age.
df.Age.value_counts()
52 19
27 19
37 18
48 17
40 17
..
15 1
11 1
9 1
16 1
12 1
Name: Age, Length: 61, dtype: int64
# Line plot of sleep efficiency against age, with a separate line style per gender.
sns.relplot(
data=df, kind="line",
x="Age", y="Sleep_efficiency", style="Gender", color="purple"
)
plt.show()
# Number of smokers and non-smokers.
df.Smoking_status.value_counts()
No 298 Yes 154 Name: Smoking_status, dtype: int64
# Bar chart of the smoking-status counts.
sns.countplot(data=df,x="Smoking_status", color="pink")
plt.xlabel("Yes or No", color="blue",fontsize=10)
plt.ylabel("Count", color="blue",fontsize=10)
plt.title("number of smokers and non-smokers", color="blue",fontsize=10)
plt.show()
# Number of rows per exercise-frequency value (missing values are not counted).
df.Exercise_frequency.value_counts()
3.0 130 0.0 116 1.0 97 2.0 54 4.0 41 5.0 8 Name: Exercise_frequency, dtype: int64
# Filled kernel-density estimate of exercise frequency.
sns.kdeplot(data=df, x="Exercise_frequency",color="brown",fill=True)
plt.xlabel("Exercise Frequency", color="brown", fontsize=10)
plt.ylabel("frequency", color="brown", fontsize=10)
plt.title("Exercise Frequency Kdeplot", color="brown",fontsize=10)
plt.show()
# Number of rows per light-sleep percentage value.
df.Light_sleep_percentage.value_counts()
20 52 15 49 17 46 18 45 10 34 13 32 12 28 45 20 21 17 52 16 47 16 53 13 55 13 16 12 19 11 48 10 54 8 50 7 22 4 56 3 51 3 14 3 7 3 11 2 62 1 46 1 30 1 40 1 63 1 Name: Light_sleep_percentage, dtype: int64
# Filled kernel-density estimate of the light-sleep percentage.
sns.kdeplot(data=df, x="Light_sleep_percentage",color="gray",fill=True)
plt.xlabel("Light sleep percentage", color="gray", fontsize=10)
plt.ylabel("frequency", color="gray", fontsize=10)
plt.title("Light sleep percentage kdeplot", color="gray",fontsize=10)
plt.show()
# Number of rows per REM-sleep percentage value.
df.REM_sleep_percentage.value_counts()
20 92 22 67 28 58 23 56 18 49 25 31 24 26 27 25 26 15 15 14 19 11 30 7 21 1 Name: REM_sleep_percentage, dtype: int64
# Filled kernel-density estimate of the REM-sleep percentage.
# The x label and title used to say "Sleep duration", although the plotted
# column is REM_sleep_percentage.
sns.kdeplot(data=df, x="REM_sleep_percentage", color="black", fill=True)
plt.xlabel("REM sleep percentage", color="black", fontsize=10)
plt.ylabel("frequency", color="black", fontsize=10)
plt.title("REM sleep percentage kdeplot", color="black", fontsize=10)
plt.show()
# Sleep efficiency grouped by caffeine consumption level.
sns.boxplot(data=df,x="Caffeine_consumption",y="Sleep_efficiency", color="pink")
plt.title("Does caffeine consumption affect sleep?", color="black",fontsize=10)
plt.show()
# Sleep efficiency grouped by alcohol consumption level.
sns.boxplot(data=df,x="Alcohol_consumption",y="Sleep_efficiency", color="blue")
plt.title("What is the effect of drinking alcohol on sleep efficiency?", color="blue",fontsize=10)
plt.show()
# Display the frame again, now with the renamed columns.
df
| ID | Age | Gender | Bedtime | Wakeup_time | Sleep_duration | Sleep_efficiency | REM_sleep_percentage | Deep_sleep_percentage | Light_sleep_percentage | Awakenings | Caffeine_consumption | Alcohol_consumption | Smoking_status | Exercise_frequency | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 65 | Female | 2021-03-06 01:00:00 | 2021-03-06 07:00:00 | 6.0 | 0.88 | 18 | 70 | 12 | 0.0 | 0.0 | 0.0 | Yes | 3.0 |
| 1 | 2 | 69 | Male | 2021-12-05 02:00:00 | 2021-12-05 09:00:00 | 7.0 | 0.66 | 19 | 28 | 53 | 3.0 | 0.0 | 3.0 | Yes | 3.0 |
| 2 | 3 | 40 | Female | 2021-05-25 21:30:00 | 2021-05-25 05:30:00 | 8.0 | 0.89 | 20 | 70 | 10 | 1.0 | 0.0 | 0.0 | No | 3.0 |
| 3 | 4 | 40 | Female | 2021-11-03 02:30:00 | 2021-11-03 08:30:00 | 6.0 | 0.51 | 23 | 25 | 52 | 3.0 | 50.0 | 5.0 | Yes | 1.0 |
| 4 | 5 | 57 | Male | 2021-03-13 01:00:00 | 2021-03-13 09:00:00 | 8.0 | 0.76 | 27 | 55 | 18 | 3.0 | 0.0 | 3.0 | No | 3.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 447 | 448 | 27 | Female | 2021-11-13 22:00:00 | 2021-11-13 05:30:00 | 7.5 | 0.91 | 22 | 57 | 21 | 0.0 | 0.0 | 0.0 | No | 5.0 |
| 448 | 449 | 52 | Male | 2021-03-31 21:00:00 | 2021-03-31 03:00:00 | 6.0 | 0.74 | 28 | 57 | 15 | 4.0 | 25.0 | 0.0 | No | 3.0 |
| 449 | 450 | 40 | Female | 2021-09-07 23:00:00 | 2021-09-07 07:30:00 | 8.5 | 0.55 | 20 | 32 | 48 | 1.0 | NaN | 3.0 | Yes | 0.0 |
| 450 | 451 | 45 | Male | 2021-07-29 21:00:00 | 2021-07-29 04:00:00 | 7.0 | 0.76 | 18 | 72 | 10 | 3.0 | 0.0 | 0.0 | No | 3.0 |
| 451 | 452 | 18 | Male | 2021-03-17 02:30:00 | 2021-03-17 10:00:00 | 7.5 | 0.63 | 22 | 23 | 55 | 1.0 | 50.0 | 0.0 | No | 1.0 |
452 rows × 15 columns
# Use ID as the row index (the column is removed from the data), then preview.
df.set_index('ID', inplace=True,drop=True)
df.head()
| Age | Gender | Bedtime | Wakeup_time | Sleep_duration | Sleep_efficiency | REM_sleep_percentage | Deep_sleep_percentage | Light_sleep_percentage | Awakenings | Caffeine_consumption | Alcohol_consumption | Smoking_status | Exercise_frequency | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ID | ||||||||||||||
| 1 | 65 | Female | 2021-03-06 01:00:00 | 2021-03-06 07:00:00 | 6.0 | 0.88 | 18 | 70 | 12 | 0.0 | 0.0 | 0.0 | Yes | 3.0 |
| 2 | 69 | Male | 2021-12-05 02:00:00 | 2021-12-05 09:00:00 | 7.0 | 0.66 | 19 | 28 | 53 | 3.0 | 0.0 | 3.0 | Yes | 3.0 |
| 3 | 40 | Female | 2021-05-25 21:30:00 | 2021-05-25 05:30:00 | 8.0 | 0.89 | 20 | 70 | 10 | 1.0 | 0.0 | 0.0 | No | 3.0 |
| 4 | 40 | Female | 2021-11-03 02:30:00 | 2021-11-03 08:30:00 | 6.0 | 0.51 | 23 | 25 | 52 | 3.0 | 50.0 | 5.0 | Yes | 1.0 |
| 5 | 57 | Male | 2021-03-13 01:00:00 | 2021-03-13 09:00:00 | 8.0 | 0.76 | 27 | 55 | 18 | 3.0 | 0.0 | 3.0 | No | 3.0 |
# Encode the two categorical columns as 0/1 integers:
#   Gender          -> 0 for 'Male', 1 for anything else
#   Smoking_status  -> 0 for 'No',   1 for anything else
df['Gender'] = df['Gender'].map(lambda gender: int(gender != 'Male'))
df['Smoking_status'] = df['Smoking_status'].map(lambda status: int(status != 'No'))
df
| Age | Gender | Bedtime | Wakeup_time | Sleep_duration | Sleep_efficiency | REM_sleep_percentage | Deep_sleep_percentage | Light_sleep_percentage | Awakenings | Caffeine_consumption | Alcohol_consumption | Smoking_status | Exercise_frequency | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ID | ||||||||||||||
| 1 | 65 | 1 | 2021-03-06 01:00:00 | 2021-03-06 07:00:00 | 6.0 | 0.88 | 18 | 70 | 12 | 0.0 | 0.0 | 0.0 | 1 | 3.0 |
| 2 | 69 | 0 | 2021-12-05 02:00:00 | 2021-12-05 09:00:00 | 7.0 | 0.66 | 19 | 28 | 53 | 3.0 | 0.0 | 3.0 | 1 | 3.0 |
| 3 | 40 | 1 | 2021-05-25 21:30:00 | 2021-05-25 05:30:00 | 8.0 | 0.89 | 20 | 70 | 10 | 1.0 | 0.0 | 0.0 | 0 | 3.0 |
| 4 | 40 | 1 | 2021-11-03 02:30:00 | 2021-11-03 08:30:00 | 6.0 | 0.51 | 23 | 25 | 52 | 3.0 | 50.0 | 5.0 | 1 | 1.0 |
| 5 | 57 | 0 | 2021-03-13 01:00:00 | 2021-03-13 09:00:00 | 8.0 | 0.76 | 27 | 55 | 18 | 3.0 | 0.0 | 3.0 | 0 | 3.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 448 | 27 | 1 | 2021-11-13 22:00:00 | 2021-11-13 05:30:00 | 7.5 | 0.91 | 22 | 57 | 21 | 0.0 | 0.0 | 0.0 | 0 | 5.0 |
| 449 | 52 | 0 | 2021-03-31 21:00:00 | 2021-03-31 03:00:00 | 6.0 | 0.74 | 28 | 57 | 15 | 4.0 | 25.0 | 0.0 | 0 | 3.0 |
| 450 | 40 | 1 | 2021-09-07 23:00:00 | 2021-09-07 07:30:00 | 8.5 | 0.55 | 20 | 32 | 48 | 1.0 | NaN | 3.0 | 1 | 0.0 |
| 451 | 45 | 0 | 2021-07-29 21:00:00 | 2021-07-29 04:00:00 | 7.0 | 0.76 | 18 | 72 | 10 | 3.0 | 0.0 | 0.0 | 0 | 3.0 |
| 452 | 18 | 0 | 2021-03-17 02:30:00 | 2021-03-17 10:00:00 | 7.5 | 0.63 | 22 | 23 | 55 | 1.0 | 50.0 | 0.0 | 0 | 1.0 |
452 rows × 14 columns
# Discard the two timestamp columns, then every row that still has a missing value.
df = df.drop(columns=['Bedtime', 'Wakeup_time']).dropna()
df.tail()
| Age | Gender | Sleep_duration | Sleep_efficiency | REM_sleep_percentage | Deep_sleep_percentage | Light_sleep_percentage | Awakenings | Caffeine_consumption | Alcohol_consumption | Smoking_status | Exercise_frequency | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ID | ||||||||||||
| 446 | 30 | 1 | 7.5 | 0.53 | 28 | 20 | 52 | 4.0 | 50.0 | 2.0 | 1 | 1.0 |
| 448 | 27 | 1 | 7.5 | 0.91 | 22 | 57 | 21 | 0.0 | 0.0 | 0.0 | 0 | 5.0 |
| 449 | 52 | 0 | 6.0 | 0.74 | 28 | 57 | 15 | 4.0 | 25.0 | 0.0 | 0 | 3.0 |
| 451 | 45 | 0 | 7.0 | 0.76 | 18 | 72 | 10 | 3.0 | 0.0 | 0.0 | 0 | 3.0 |
| 452 | 18 | 0 | 7.5 | 0.63 | 22 | 23 | 55 | 1.0 | 50.0 | 0.0 | 0 | 1.0 |
# Separate the regression target from the feature matrix.
target_column = 'Sleep_efficiency'
X = df.drop(columns=target_column)
y = df[target_column]
X
| Age | Gender | Sleep_duration | REM_sleep_percentage | Deep_sleep_percentage | Light_sleep_percentage | Awakenings | Caffeine_consumption | Alcohol_consumption | Smoking_status | Exercise_frequency | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| ID | |||||||||||
| 1 | 65 | 1 | 6.0 | 18 | 70 | 12 | 0.0 | 0.0 | 0.0 | 1 | 3.0 |
| 2 | 69 | 0 | 7.0 | 19 | 28 | 53 | 3.0 | 0.0 | 3.0 | 1 | 3.0 |
| 3 | 40 | 1 | 8.0 | 20 | 70 | 10 | 1.0 | 0.0 | 0.0 | 0 | 3.0 |
| 4 | 40 | 1 | 6.0 | 23 | 25 | 52 | 3.0 | 50.0 | 5.0 | 1 | 1.0 |
| 5 | 57 | 0 | 8.0 | 27 | 55 | 18 | 3.0 | 0.0 | 3.0 | 0 | 3.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 446 | 30 | 1 | 7.5 | 28 | 20 | 52 | 4.0 | 50.0 | 2.0 | 1 | 1.0 |
| 448 | 27 | 1 | 7.5 | 22 | 57 | 21 | 0.0 | 0.0 | 0.0 | 0 | 5.0 |
| 449 | 52 | 0 | 6.0 | 28 | 57 | 15 | 4.0 | 25.0 | 0.0 | 0 | 3.0 |
| 451 | 45 | 0 | 7.0 | 18 | 72 | 10 | 3.0 | 0.0 | 0.0 | 0 | 3.0 |
| 452 | 18 | 0 | 7.5 | 22 | 23 | 55 | 1.0 | 50.0 | 0.0 | 0 | 1.0 |
388 rows × 11 columns
# Display the target series.
y
ID
1 0.88
2 0.66
3 0.89
4 0.51
5 0.76
...
446 0.53
448 0.91
449 0.74
451 0.76
452 0.63
Name: Sleep_efficiency, Length: 388, dtype: float64
def train_evaluate_model(model, X, y, n_splits=5, is_nn=False, epochs=100):
    '''
    Cross-validate a regression model and summarise its MSE and R^2.

    The rows are split with a shuffled KFold (random_state=42). The same
    ``model`` instance is fitted on each training fold and scored on the
    matching held-out fold.

    arguments:
    model -- A regression estimator exposing ``fit`` and ``predict``
    X -- The input features for the model (rows are selected with ``.iloc``)
    y -- The target variable for the model (rows are selected with ``.iloc``)
    n_splits -- The number of folds to use in the KFold cross-validation (default=5)
    is_nn -- When True, ``fit`` is called with ``epochs=epochs, verbose=0``
             instead of the plain two-argument form (default=False)
    epochs -- Number of epochs forwarded to ``fit`` when ``is_nn`` is True (default=100)
    Returns:
    a one-row DataFrame with the columns 'Mean_Squared_Error', 'MSE Std Dev',
    'R_squared' and 'R2 Std Dev' (mean and standard deviation across folds)
    '''
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Per-fold scores
    mse_scores = []
    r2_scores = []

    # Plain (non-stratified) shuffled k-fold cross-validation
    for train_index, test_index in kf.split(X, y):
        X_train, y_train = X.iloc[train_index], y.iloc[train_index]
        X_test, y_test = X.iloc[test_index], y.iloc[test_index]

        # Fit exactly once per fold. Without the ``else`` the plain ``fit``
        # always ran too, refitting the model after the ``epochs`` fit.
        # NOTE(review): if the model's ``fit`` continues from its previous
        # state rather than resetting, folds are not independent -- confirm
        # before relying on is_nn=True.
        if is_nn:
            model.fit(X_train, y_train, epochs=epochs, verbose=0)
        else:
            model.fit(X_train, y_train)

        # Score the held-out fold
        y_pred = model.predict(X_test)
        mse_scores.append(mean_squared_error(y_test, y_pred))
        r2_scores.append(r2_score(y_test, y_pred))

    # Mean and (population) standard deviation of each metric across folds
    eval_df = pd.DataFrame({
        'Mean_Squared_Error': [np.mean(mse_scores)],
        'MSE Std Dev': [np.std(mse_scores)],
        'R_squared': [np.mean(r2_scores)],
        'R2 Std Dev': [np.std(r2_scores)],
    })
    return eval_df
# Baseline: linear regression scored with 10-fold cross-validation.
# Its row starts the model-comparison table.
lr = LinearRegression()
results = train_evaluate_model(lr, X, y, n_splits=10)
results.index = ['LinearRegression']
(
    results
    .sort_values(by='Mean_Squared_Error', ascending=True)
    .style
    .background_gradient(cmap=sns.color_palette("ch:s=-.2,r=.6", as_cmap=True))
)
| Mean_Squared_Error | MSE Std Dev | R_squared | R2 Std Dev | |
|---|---|---|---|---|
| LinearRegression | 0.003825 | 0.000643 | 0.779134 | 0.062640 |
# Decision tree, 10-fold cross-validation; add its row to the comparison table.
decision_tree = DecisionTreeRegressor()
decision_tree_results = train_evaluate_model(decision_tree, X, y, 10)
decision_tree_results.index = ['DecisionTree']
# DataFrame.append was removed in pandas 2.0; pd.concat keeps the index labels the same way.
results = pd.concat([results, decision_tree_results])
results.sort_values(by='Mean_Squared_Error', ascending=True).style.background_gradient(cmap=sns.color_palette("ch:s=-.2,r=.6", as_cmap=True))
| Mean_Squared_Error | MSE Std Dev | R_squared | R2 Std Dev | |
|---|---|---|---|---|
| LinearRegression | 0.003825 | 0.000643 | 0.779134 | 0.062640 |
| DecisionTree | 0.004250 | 0.000779 | 0.757620 | 0.063238 |
# k-nearest neighbours (k=7), 10-fold cross-validation; add its row to the comparison table.
KNN = KNeighborsRegressor(n_neighbors=7)
knn = train_evaluate_model(KNN, X, y, 10)
knn.index =['KNearsNeighbors']
# DataFrame.append was removed in pandas 2.0; pd.concat keeps the index labels the same way.
results = pd.concat([results, knn])
# Sort best-first (lowest MSE on top), matching every other results table in this notebook.
results.sort_values(by='Mean_Squared_Error', ascending=True).style.background_gradient(cmap=sns.color_palette("ch:s=-.2,r=.6", as_cmap=True))
| Mean_Squared_Error | MSE Std Dev | R_squared | R2 Std Dev | |
|---|---|---|---|---|
| KNearsNeighbors | 0.004897 | 0.000606 | 0.720204 | 0.056080 |
| DecisionTree | 0.004250 | 0.000779 | 0.757620 | 0.063238 |
| LinearRegression | 0.003825 | 0.000643 | 0.779134 | 0.062640 |
# Random forest, 10-fold cross-validation; add its row to the comparison table.
rfr = RandomForestRegressor()
rfr_result = train_evaluate_model(rfr, X, y, 10)
rfr_result.index = ['RandomForest']
# DataFrame.append was removed in pandas 2.0; pd.concat keeps the index labels the same way.
results = pd.concat([results, rfr_result])
results.sort_values(by='Mean_Squared_Error', ascending=True).style.background_gradient(cmap=sns.color_palette("ch:s=-.2,r=.6", as_cmap=True))
| Mean_Squared_Error | MSE Std Dev | R_squared | R2 Std Dev | |
|---|---|---|---|---|
| RandomForest | 0.002458 | 0.000620 | 0.858697 | 0.045146 |
| LinearRegression | 0.003825 | 0.000643 | 0.779134 | 0.062640 |
| DecisionTree | 0.004250 | 0.000779 | 0.757620 | 0.063238 |
| KNearsNeighbors | 0.004897 | 0.000606 | 0.720204 | 0.056080 |
# XGBoost, 10-fold cross-validation; add its row to the comparison table.
xgboost = xgb.XGBRegressor()
xgboost_result = train_evaluate_model(xgboost, X, y, 10)
xgboost_result.index = ['XGBoost']
# DataFrame.append was removed in pandas 2.0; pd.concat keeps the index labels the same way.
results = pd.concat([results, xgboost_result])
results.sort_values(by='Mean_Squared_Error', ascending=True).style.background_gradient(cmap=sns.color_palette("ch:s=-.2,r=.6", as_cmap=True))
| Mean_Squared_Error | MSE Std Dev | R_squared | R2 Std Dev | |
|---|---|---|---|---|
| RandomForest | 0.002458 | 0.000620 | 0.858697 | 0.045146 |
| XGBoost | 0.002716 | 0.000724 | 0.842738 | 0.054822 |
| LinearRegression | 0.003825 | 0.000643 | 0.779134 | 0.062640 |
| DecisionTree | 0.004250 | 0.000779 | 0.757620 | 0.063238 |
| KNearsNeighbors | 0.004897 | 0.000606 | 0.720204 | 0.056080 |
# CatBoost (training log silenced), 10-fold cross-validation; add its row to the comparison table.
catboost = CatBoostRegressor(verbose=False)
catboost_result = train_evaluate_model(catboost, X, y, 10)
catboost_result.index = ['Catboost']
# DataFrame.append was removed in pandas 2.0; pd.concat keeps the index labels the same way.
results = pd.concat([results, catboost_result])
results.sort_values(by='Mean_Squared_Error', ascending=True).style.background_gradient(cmap=sns.color_palette("ch:s=-.2,r=.6", as_cmap=True))
| Mean_Squared_Error | MSE Std Dev | R_squared | R2 Std Dev | |
|---|---|---|---|---|
| Catboost | 0.002388 | 0.000468 | 0.863124 | 0.036016 |
| RandomForest | 0.002458 | 0.000620 | 0.858697 | 0.045146 |
| XGBoost | 0.002716 | 0.000724 | 0.842738 | 0.054822 |
| LinearRegression | 0.003825 | 0.000643 | 0.779134 | 0.062640 |
| DecisionTree | 0.004250 | 0.000779 | 0.757620 | 0.063238 |
| KNearsNeighbors | 0.004897 | 0.000606 | 0.720204 | 0.056080 |
# LightGBM, 10-fold cross-validation; add its row to the comparison table.
lgbmr = LGBMRegressor()
lgbmr_result = train_evaluate_model(lgbmr, X, y, 10)
lgbmr_result.index = ['LGBM']
# DataFrame.append was removed in pandas 2.0; pd.concat keeps the index labels the same way.
results = pd.concat([results, lgbmr_result])
results.sort_values(by='Mean_Squared_Error', ascending=True).style.background_gradient(cmap=sns.color_palette("ch:s=-.2,r=.6", as_cmap=True))
| Mean_Squared_Error | MSE Std Dev | R_squared | R2 Std Dev | |
|---|---|---|---|---|
| LGBM | 0.002313 | 0.000537 | 0.867179 | 0.040736 |
| Catboost | 0.002388 | 0.000468 | 0.863124 | 0.036016 |
| RandomForest | 0.002458 | 0.000620 | 0.858697 | 0.045146 |
| XGBoost | 0.002716 | 0.000724 | 0.842738 | 0.054822 |
| LinearRegression | 0.003825 | 0.000643 | 0.779134 | 0.062640 |
| DecisionTree | 0.004250 | 0.000779 | 0.757620 | 0.063238 |
| KNearsNeighbors | 0.004897 | 0.000606 | 0.720204 | 0.056080 |
# Averaging ensemble of fresh LightGBM, CatBoost and XGBoost regressors,
# 10-fold cross-validation; add its row to the comparison table.
regressors = [
    ('lgbmr', LGBMRegressor()),
    ('catboost', CatBoostRegressor(verbose=False)),
    ('xgb', xgb.XGBRegressor()),
]
voting_regressor = VotingRegressor(regressors)
voting_regressor_result = train_evaluate_model(voting_regressor, X, y, 10)
voting_regressor_result.index = ['Ensemble']
# DataFrame.append was removed in pandas 2.0; pd.concat keeps the index labels the same way.
results = pd.concat([results, voting_regressor_result])
results.sort_values(by='Mean_Squared_Error', ascending=True).style.background_gradient(cmap=sns.color_palette("ch:s=-.2,r=.6", as_cmap=True))
| Mean_Squared_Error | MSE Std Dev | R_squared | R2 Std Dev | |
|---|---|---|---|---|
| Ensemble | 0.002290 | 0.000536 | 0.868065 | 0.040479 |
| LGBM | 0.002313 | 0.000537 | 0.867179 | 0.040736 |
| Catboost | 0.002388 | 0.000468 | 0.863124 | 0.036016 |
| RandomForest | 0.002458 | 0.000620 | 0.858697 | 0.045146 |
| XGBoost | 0.002716 | 0.000724 | 0.842738 | 0.054822 |
| LinearRegression | 0.003825 | 0.000643 | 0.779134 | 0.062640 |
| DecisionTree | 0.004250 | 0.000779 | 0.757620 | 0.063238 |
| KNearsNeighbors | 0.004897 | 0.000606 | 0.720204 | 0.056080 |
# Apply the whole plot theme in one call. The separate sns.set(rc=...) used to
# run last and reset the style, context and palette chosen just before it to
# seaborn's defaults, so only the figure size took effect.
sns.set(
    context='poster',
    style='whitegrid',
    palette='colorblind',
    rc={'figure.figsize': (12, 8)},
)
# `xgboost` was last fitted inside train_evaluate_model, i.e. on the training
# part of the final cross-validation fold; these are that fit's importances.
sns.barplot(x=xgboost.feature_importances_, y=X.columns)
plt.title('FEATURE IMPORTANCE')
plt.show()
# One horizontal bar chart per metric, one bar per model. Each entry pairs a
# metric column with the colour scale used for its bars.
models = results.index.values
for metric, scale in (
    ('Mean_Squared_Error', 'rdpu_r'),
    ('R_squared', 'rdpu'),
    ('MSE Std Dev', 'rdpu_r'),
    ('R2 Std Dev', 'rdpu_r'),
):
    metric_values = results[metric].iloc[:9]
    fig = px.bar(
        x=metric_values.values,
        y=models,
        orientation='h',
        color=metric_values,
        color_continuous_scale=scale,
        template="seaborn",
    )
    fig.update_layout(
        width=800,
        height=600,
        xaxis=dict(title=metric),
        yaxis=dict(title="Models"),
    )
    fig.show()